/*******************************************************************************

This code file cleans CSVs that I converted (somewhat messily) from NYC HPD
PDFs using the open-source conversion software Tabula. In particular, the HPD
files identify all BBLs to which the GEA was expanded in 2007. The original
GEA can be identified as the area of Manhattan that was not part of the expansion,
since all of Manhattan lies in the expanded GEA, and the original GEA was
entirely in Manhattan.

*******************************************************************************/

*** Manage settings

	* Executes the shared project settings do-file.
	* NOTE(review): the global macro named data, used in the file paths below,
	* is not defined in this file, so it is presumably set there -- confirm.
	run "~/Dropbox (MIT)/Research/NYC421a/code/modules/settings.do"
	
*** Import each Tabula-extracted CSV and stash it in its own tempfile

	* Parallel word lists: the i-th CSV filename stem is saved under the i-th
	* tempfile macro name. These macro names are the ones the append step uses.
	local csv_stems manhattan bronx queens staten-Island Brooklyn-part-one Brooklyn-part-two
	local tmp_names manhattan bronx queens staten_island brooklyn_1 brooklyn_2
	local n_files : word count `csv_stems'

	forvalues i = 1/`n_files' {
		local stem : word `i' of `csv_stems'
		local tmp  : word `i' of `tmp_names'

		* First row supplies variable names; columns 1-4 are forced to numeric
		import delimited "$data/raw/GEA/CSV/tabula-`stem'-GEA.csv", varnames(1) numericcols(1 2 3 4) encoding(ISO-8859-1) clear

		* Define a local macro with the name held in tmp, pointing at a temp path
		tempfile `tmp'
		save ``tmp'', replace
	}
	
*** Append together

	* Manhattan serves as the master dataset; the remaining pieces are stacked
	* beneath it one at a time, in the listed order.
	use `manhattan', clear

	foreach piece in bronx queens staten_island brooklyn_1 brooklyn_2 {
		append using ``piece''
	}
	
*** Clean data

	* Rows without a borough code are the PDF table headers repeated on each
	* page, so retain only rows where borocode is populated
	keep if !missing(borocode)

	* Discard the fields that are not needed in the output
	drop gea lot block borocode effectivedate

	* Show BBLs in fixed format with no decimals (width 18)
	format %18.0f bbl

	* Flag every remaining BBL as belonging to the 2007 GEA expansion
	generate gea_expansion = 1

	* Remove rows that are identical across all remaining variables
	duplicates drop

*** Save output

	save "$data/clean/gea_bbls.dta", replace
